Accreditation exercise for 16S rRNA data analysis

This document entails R code used for downstream analysis of 16s rRNA data used for accreditation exercise. Some bits of this are inspired by https://github.com/grbot/16SrRNA-hex-tutorial/tree/master/downstream.

Note: Tabular results that are not shown in the report are available here in addition to number and percentage of reads lost at each sub-stage of denoising data using dada2 (Only visual representation provided in report).

Pre-processing

Cleaning the taxonomy and feature abundance file.

sed 's/#OTU//' feature-table.tsv > table.tsv
asvs_abundance <- read.table("table.tsv", header = TRUE, row.names = "ID")
# dim(asvs_abundance)
sed 's/\[//g' taxonomy.tsv | sed 's/\]//g' > tax.tsv
asv_taxonomy <- read.delim("tax.tsv", row.names = 1)

asv_taxonomy[] <- lapply(asv_taxonomy, function(x) gsub("D_0__|D_1__|D_2__|D_3__|D_4__|D_5__|D_6__|D_7__|D_8__|D_9__|D_10__|D_11__|D_12__|D_13__|D_14__", 
    "", x))
asv_taxonomy <- asv_taxonomy[asv_taxonomy$Confidence > 0.97, ]  #consider 97% classification confidence threshold
# Re-organise taxonomy dataframe to have seven columns each with single
# taxonomic rank
require(splitstackshape)
tmp <- cSplit(asv_taxonomy, "Taxon", ";")
tmp <- tmp[, -1]  #remove confidence column
tmp <- tmp[, 1:7]  #pick first seven columns
names(tmp) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")  #assign new column names
tax <- data.frame(tmp, row.names = rownames(asv_taxonomy))
tax <- tax[tax$Phylum != "" & !is.na(tax$Phylum), ]  #remove ASVs without Phylum level classification
tax$Species[is.na(tax$Species)] <- ""
tax$Genus[is.na(tax$Genus)] <- ""
tax$G_species <- paste(tax$Genus, tax$Species, sep = " ")  #create a new taxonomic rank (species and genus for which species is not assigned)

Load phylogenetic tree

require(ape)
asv_tree <- read_tree("tree.nwk")

Construct a phyloseq object, composed of ASV abundance, taxonomy, tree and meta data.

require(phyloseq)
ASV <- otu_table(as.matrix(asvs_abundance), taxa_are_rows = T)
TAX <- tax_table(as.matrix(tax))
meta_data <- read.delim("set2_meta.txt", row.names = 1)
meta_data$sample <- row.names(meta_data)
meta_data$BV <- as.factor(meta_data$BV)
meta_data$Inflammation <- as.factor(meta_data$Inflammation)
SAM <- sample_data(meta_data)
tree <- compute.brlen(asv_tree, method = "Grafen")
physeq <- merge_phyloseq(phyloseq(ASV, TAX), SAM, tree)

Obtain number and percentage of ASVs identified at each taxonomic rank

sp <- dim(tax[!is.na(tax$Species), ])[1]  #Species level
sp/dim(tax)[1]
gp <- dim(tax[!is.na(tax$Genus) & tax$Genus != "", ])[1]  #Genus level
gp/dim(tax)[1]
fm <- dim(tax[!is.na(tax$Family) & tax$Family != "", ])[1]  #Family level
fm/dim(tax)[1]
saveRDS(physeq, "physeq_silva_972")  #save the phyloseq object for future use

From now onwards, we use the previous saved phyloseq object for downstream analysis. The phyloseq object used downstream is available at https://github.com/galaxyuvri-ea/16S-rRNA-analysis/raw/master/dataFiles/physeq_silva_972.

require(kableExtra)
require(phyloseq)
phy <- readRDS("physeq_silva_972")
meta <- data.frame(sample_data(phy))
meta %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
Inflammation BV Age BMI sample
S1 Low Negative 19 22.48100 S1
S100 High Positive 17 27.82931 S100
S101 High Positive 19 NA S101
S102 High Negative 18 28.95900 S102
S103 High Negative 20 26.95312 S103
S104 Low Negative 18 27.05380 S104
S105 Low Negative 18 20.69049 S105
S106 High Positive 17 21.21832 S106
S107 High Negative 22 22.49964 S107
S108 High Positive 20 NA S108
S109 High Positive 19 34.85018 S109
S11 Low Negative 20 18.87100 S11
S111 Low Negative 22 23.35564 S111
S112 High Positive 21 24.65483 S112
S114 High Positive 20 29.27796 S114
S115 High Positive 19 28.54828 S115
S116 High Positive 20 21.77844 S116
S117 High Positive 20 29.51594 S117
S118 High Positive 19 22.58271 S118
S119 High Negative 17 17.79993 S119
S120 High Negative 22 35.10667 S120
S121 High Positive 17 31.55556 S121
S122 Low Negative 17 25.39062 S122
S123 High Positive 16 21.05171 S123
S124 High Positive 17 25.21736 S124
S126 High Positive 17 27.54821 S126
S127 Low Positive 20 30.47052 S127
S128 High Negative 17 19.14672 S128
S129 Low Negative 16 22.32143 S129
S130 Low Negative 16 26.40235 S130
S131 Low Negative 19 25.15315 S131
S132 NA Positive 16 21.09619 S132
S134 Low Negative 20 18.13122 S134
S135 High Positive 19 30.48780 S135
S137 Low Negative 18 28.36035 S137
S138 High Positive 16 18.61150 S138
S139 High Positive 18 27.25089 S139
S14 Low Negative NA 29.01700 S14
S140 Low Negative 18 21.87500 S140
S141 High Positive 20 27.55556 S141
S142 Low Negative 20 22.49964 S142
S143 High Positive 18 NA S143
S145 Low Negative 18 28.44444 S145
S15 Low Negative NA 16.18400 S15
S16 Low Negative 16 21.35800 S16
S17 Low Negative 17 27.66000 S17
S18 Low Negative 16 30.35900 S18
S2 Low Positive NA NA S2
S21 High Negative 17 22.50000 S21
S24 Low Negative 16 31.08000 S24
S25 Low Positive NA 20.96400 S25
S26 Low Negative 16 19.23400 S26
S28 Low Negative 17 24.84100 S28
S29 Low Negative 17 20.83000 S29
S3 High Negative 18 22.15100 S3
S31 High Positive 18 19.48700 S31
S33 High Negative 21 34.96400 S33
S34 High Positive 21 18.80800 S34
S35 High Positive NA 22.03200 S35
S37 High Positive NA 22.86200 S37
S38 Low Negative 17 23.55600 S38
S39 Low Negative 17 26.02600 S39
S4 High Positive 19 29.37200 S4
S40 Low Negative 20 25.96500 S40
S42 High Positive 16 23.50800 S42
S43 Low Negative 17 26.39800 S43
S44 Low Negative 16 22.89300 S44
S46 Low Negative 16 20.56900 S46
S47 Low Positive 18 34.92800 S47
S48 Low Negative 18 23.62400 S48
S50 High Negative NA 20.44900 S50
S51 High Negative 19 22.47700 S51
S52 High Positive 21 16.75700 S52
S53 Low Negative 18 22.60000 S53
S54 High Positive 19 40.00900 S54
S55 Low Positive 19 24.46500 S55
S56 Low Negative 18 18.31400 S56
S57 Low Positive 21 21.33800 S57
S58 High Positive 19 21.33800 S58
S59 High Positive 21 21.33800 S59
S60 Low Positive 19 23.04700 S60
S61 Low Positive 19 26.49200 S61
S62 High Positive 19 21.60400 S62
S63 High Positive 18 21.21800 S63
S64 High Positive 22 19.83471 S64
S65 High Negative 19 NA S65
S66 Low Negative 19 36.10694 S66
S67 Low Negative 18 24.80159 S67
S68 High Positive 19 22.98145 S68
S69 High Positive 17 26.03749 S69
S71 High Positive 19 33.05785 S71
S72 Low Positive 20 30.07598 S72
S74 High Negative 18 27.35885 S74
S75 High Negative 18 20.07775 S75
S77 High Negative 18 26.31464 S77
S78 Low Positive 18 21.71925 S78
S79 High Positive 21 30.17882 S79
S8 NA Negative 19 28.76400 S8
S80 High Negative 19 19.72387 S80
S81 Low Negative 20 28.35306 S81
S82 High Positive 18 23.42209 S82
S83 High Positive 20 25.00000 S83
S84 High Positive 18 20.90420 S84
S85 High Positive 19 25.72242 S85
S86 Low Negative 19 19.47341 S86
S88 Low Negative 17 22.83288 S88
S89 High Positive 18 25.71101 S89
S9 Low Negative 22 21.75500 S9
S90 High Negative 18 29.27796 S90
S92 Low Negative 20 20.83000 S92
S93 High Positive 18 36.64982 S93
S94 High Positive 16 23.82812 S94
S95 High Negative 17 25.14861 S95
S96 High Positive 16 28.87544 S96
S97 High Positive 17 17.36044 S97
S98 High Positive 18 21.07720 S98

Denosing stats

require(kableExtra)
stats <- read.table("data/stats.tsv", header = T)
stats %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
sample.id input filtered denoised merged non.chimeric
S1 108494 71218 71218 64963 64963
S100 126661 48555 48555 47328 47328
S101 150614 107251 107251 106154 106154
S102 146772 121474 121474 119994 119994
S103 366244 305258 305258 301238 301238
S104 339681 273582 273582 266178 266178
S105 176537 130659 130659 127538 127538
S106 409963 322929 322929 315569 315569
S107 162844 104936 104936 102849 102849
S108 320033 252510 252510 249186 249186
S109 322310 260472 260472 256255 256255
S11 111790 82721 82721 76572 76572
S111 282144 239108 239108 236964 236964
S112 49460 36709 36709 36462 36462
S114 331129 274093 274093 269072 269072
S115 109200 88749 88749 87596 87596
S116 205450 141207 141207 138321 138321
S117 254884 206061 206061 201946 201946
S118 28687 14662 14662 14361 14361
S119 350937 234338 234338 226540 226540
S120 1532253 1281379 1281379 1263436 1263436
S121 345921 252646 252646 248813 248813
S122 353419 295442 295442 292978 292978
S123 225713 179217 179217 176567 176567
S124 140088 106449 106449 102541 102541
S126 380866 285061 285061 280926 280926
S127 536496 422807 422807 417772 417772
S128 210601 175092 175092 174022 174022
S129 90546 68813 68813 68253 68253
S130 387131 319617 319617 313316 313316
S131 217614 170242 170242 167290 167290
S132 133395 89377 89377 84978 84978
S134 141947 100908 100908 93233 93233
S135 303167 251788 251788 247979 247979
S137 83077 53328 53328 49049 49049
S138 110492 80605 80605 77501 77501
S139 361408 302110 302110 298271 298271
S14 215017 183251 183251 167757 167757
S140 33079 22371 22371 21430 21430
S141 161665 112579 112579 108088 108088
S142 45924 34647 34647 32183 32183
S143 301794 255139 255139 251130 251130
S145 106428 78663 78663 73366 73366
S15 207191 168396 168396 153352 153352
S16 51951 40834 40834 37945 37945
S17 226827 184346 184346 168521 168521
S18 70709 44884 44884 41018 41018
S2 258746 193555 193555 172767 172767
S21 14155 11069 11069 10453 10453
S24 104447 75697 75697 70193 70193
S25 154810 119846 119846 111091 111091
S26 134898 103978 103978 95340 95340
S28 54256 43261 43261 40348 40348
S29 92226 73108 73108 68153 68153
S3 29696 21821 21821 20087 20087
S31 98386 81770 81770 78054 78054
S33 30434 24556 24556 23666 23666
S34 231852 173395 173395 156773 156773
S35 139023 111869 111869 102136 102136
S37 186767 136813 136813 122851 122851
S38 155810 101964 101964 92594 92594
S39 114609 89616 89616 79116 79116
S4 176262 144419 144419 135466 135466
S40 102954 72076 72076 66254 66254
S42 161516 130610 130610 121306 121306
S43 26710 22748 22748 21109 21109
S44 89349 64294 64294 58984 58984
S46 96397 69206 69206 64897 64897
S47 188461 156130 156130 147590 147590
S48 31718 19839 19839 18481 18481
S50 114256 88893 88893 81813 81813
S51 103734 77405 77405 71159 71159
S52 17009 12460 12460 11937 11937
S53 160395 129052 129052 118501 118501
S54 193042 155351 155351 145664 145664
S55 183286 142289 142289 129107 129107
S56 133249 93221 93221 87063 87063
S57 229468 143550 143550 126131 126131
S58 107133 84806 84806 80997 80997
S59 203605 168385 168385 157877 157877
S60 138892 100992 100992 94710 94710
S61 172758 138556 138556 130388 130388
S62 228758 177554 177554 159343 159343
S63 131697 92289 92289 86373 86373
S64 151183 114620 114620 113429 113429
S65 228293 178060 178060 176375 176375
S66 106851 68507 68507 64296 64296
S67 247723 205893 205893 203492 203492
S68 70210 57256 57256 56408 56408
S69 197006 150993 150993 148130 148130
S71 167183 124266 124266 122163 122163
S72 160242 120510 120510 115616 115616
S74 156178 107160 107160 105434 105434
S75 207264 164096 164096 162241 162241
S77 117901 84577 84577 83246 83246
S78 231741 172630 172630 170057 170057
S79 140625 100372 100372 99358 99358
S8 118954 65831 65831 60053 60053
S80 96400 27251 27251 26944 26944
S81 46643 22578 22578 22018 22018
S82 312725 247048 247048 244371 244371
S83 217518 177949 177949 168845 168845
S84 270208 232543 232543 230945 230945
S85 342197 246347 246347 238033 238033
S86 80376 49979 49979 48693 48693
S88 42682 26392 26392 24403 24403
S89 176154 139611 139611 137884 137884
S9 79870 68637 68637 63332 63332
S90 66713 35239 35239 31476 31476
S92 74331 48971 48971 48454 48454
S93 268130 212003 212003 206614 206614
S94 93202 43708 43708 43394 43394
S95 31443 12006 12006 11667 11667
S96 227880 183185 183185 181108 181108
S97 317252 268332 268332 266395 266395
S98 396734 313305 313305 309626 309626

Obtain average and percentage of reads retained after each sub-process of dada2

mean(stats$input)
[1] 185871.8
mean(stats$filtered)
[1] 143041.2
mean(stats$merged)
[1] 137931.7
mean(stats$non.chimeric)
[1] 137931.7
lost_filt <- (mean(stats$input) - mean(stats$filtered))/mean(stats$input)
lost_filt
[1] 0.2304308
lost_merge <- (mean(stats$filtered) - mean(stats$merged))/mean(stats$filtered)
lost_merge
[1] 0.03572084
lost_chime <- (mean(stats$merged) - mean(stats$non.chimeric))/mean(stats$merged)
lost_chime
[1] 0
lost_overall <- (mean(stats$input) - mean(stats$non.chimeric))/mean(stats$input)
lost_overall
[1] 0.2579204


require(plotly)
df <- stats[, -c(1)]  #sample ids
p <- ggplot(stack(df), aes(x = ind, y = values)) + geom_boxplot(col = "blue", 
    outline = FALSE) + theme_bw() + ylab("Number of reads") + xlab("")
ggplotly(p)

Detected ASvs

df <- as.data.frame(sample_sums(phy))
colnames(df) <- c("Number_of_ASVs")
df %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "200px")
Number_of_ASVs
S1 61063
S100 44511
S101 96603
S102 82465
S103 187655
S104 255049
S105 117233
S106 292794
S107 101782
S108 207574
S109 207872
S11 73001
S111 227189
S112 33625
S114 203251
S115 52786
S116 101199
S117 180747
S118 11031
S119 155800
S120 1044629
S121 210655
S122 286507
S123 140107
S124 80418
S126 253049
S127 387407
S128 171355
S129 64913
S130 279534
S131 165378
S132 68077
S134 80024
S135 189047
S137 34106
S138 67806
S139 227654
S14 164310
S140 20488
S141 81609
S142 27438
S143 191231
S145 55640
S15 123342
S16 27934
S17 152074
S18 36082
S2 128524
S21 9660
S24 68538
S25 89019
S26 92954
S28 39466
S29 60813
S3 15803
S31 67170
S33 21611
S34 127017
S35 92412
S37 101523
S38 88239
S39 71791
S4 124413
S40 53615
S42 107510
S43 20449
S44 55475
S46 60012
S47 116267
S48 4169
S50 76754
S51 67641
S52 9923
S53 117037
S54 120593
S55 116434
S56 85267
S57 20962
S58 72234
S59 130361
S60 80710
S61 116822
S62 117170
S63 74284
S64 103070
S65 167208
S66 41735
S67 201104
S68 35786
S69 92742
S71 94256
S72 88623
S74 92403
S75 140253
S77 76904
S78 140787
S79 80142
S8 24153
S80 23839
S81 20228
S82 177630
S83 148485
S84 227887
S85 181698
S86 38291
S88 22225
S89 111213
S9 62775
S90 16359
S92 46453
S93 173129
S94 37394
S95 4068
S96 110212
S97 261993
S98 287978


Filtering samples

phy <- subset_samples(phy, !is.na(Inflammation))  #remove samples for which without values for inflammation
phy <- subset_samples(phy, sample_sums(phy) > 5000)  #retain samples with 5000 and more reads.
phy <- subset_samples(phy, sample != "S120")  #remove outlier sample 
phy
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 3222 taxa and 111 samples ]
sample_data() Sample Data:       [ 111 samples by 5 sample variables ]
tax_table()   Taxonomy Table:    [ 3222 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 3222 tips and 3218 internal nodes ]

Standardising samples

total <- median(sample_sums(phy))
standf <- function(x, t = total) round(t * (x/sum(x)))
M.std <- transform_sample_counts(phy, standf)

M.std <- filter_taxa(M.std, function(x) sum(x > 10) > (0.01 * length(x)) | sum(x) > 
    0.001 * total, TRUE)
ntaxa(M.std)
[1] 1075

Unsupervised clustering

require(cluster)
library(factoextra)
df1 <- data.frame(otu_table(M.std))
df_viz <- fviz_nbclust(df1, kmeans, method = "wss") + theme_minimal()
res_fanny <- fanny(t(df1), k = 3, metric = "SqEuclidean")
sample_data(M.std)$cluster <- as.character(res_fanny$clustering)
require(microbiomeSeq)
require(pheatmap)
phyTop <- taxa_level(M.std, "G_species")  #reduce dataset to species and Genus rank
TopASVs <- names(sort(taxa_sums(phyTop), TRUE))[1:50]  #pick 50 most abundant taxa
df1 <- as.data.frame(otu_table(t(phyTop)))
df1 <- df1[which(rownames(df1) %in% TopASVs), ]
df1 <- df1[which(rownames(df1) != " "), ]
df <- data.frame((sample_data(phyTop))[, c("Inflammation", "BV")])
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F, 
    cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

Alpha diversity

require(plotly)
p <- plot_richness(M.std, x = "cluster", color = "cluster", measures = c("Observed", 
    "Simpson"), title = paste0("Standardized to total reads"))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10, 
    face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
ggplotly(p)
p <- plot_richness(M.std, x = "Inflammation", color = "Inflammation", measures = c("Observed", 
    "Simpson"), title = paste0("Standardized to total reads"))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10, 
    face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
ggplotly(p)
p <- plot_richness(M.std, x = "BV", color = "BV", measures = c("Observed", "Simpson"), 
    title = paste0("Standardized to total reads"))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10, 
    face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
ggplotly(p)
est <- estimate_richness(M.std, split = TRUE, measures = c("Simpson"))
BV_div <- cbind(est, sample_data(M.std)[, "BV"])
t <- wilcox.test(BV_div$Simpson ~ BV_div$BV)
t

    Wilcoxon rank sum test with continuity correction

data:  BV_div$Simpson by BV_div$BV
W = 398, p-value = 1.71e-11
alternative hypothesis: true location shift is not equal to 0
BInflammation_div <- cbind(est, sample_data(M.std)[, "Inflammation"])
t <- wilcox.test(BInflammation_div$Simpson ~ BInflammation_div$Inflammation)
t

    Wilcoxon rank sum test with continuity correction

data:  BInflammation_div$Simpson by BInflammation_div$Inflammation
W = 2204, p-value = 3.854e-05
alternative hypothesis: true location shift is not equal to 0
Cluster_div <- cbind(est, sample_data(M.std)[, "cluster"])
dunn.test::dunn.test(Cluster_div$Simpson, Cluster_div$cluster)
  Kruskal-Wallis rank sum test

data: x and group
Kruskal-Wallis chi-squared = 58.3445, df = 2, p-value = 0

                           Comparison of x by group                            
                                (No adjustment)                                
Col Mean-|
Row Mean |          1          2
---------+----------------------
       2 |  -5.906293
         |    0.0000*
         |
       3 |  -0.080297   6.501771
         |     0.4680    0.0000*

alpha = 0.05
Reject Ho if p <= alpha/2

Beta diversity

# ordination based on NMDS and bray-curtis distance
NMDS_bray <- ordinate(M.std, "NMDS")
Square root transformation
Wisconsin double standardization
Run 0 stress 0.2055082 
Run 1 stress 0.2309574 
Run 2 stress 0.217807 
Run 3 stress 0.2221731 
Run 4 stress 0.1968815 
... New best solution
... Procrustes: rmse 0.0485541  max resid 0.2755098 
Run 5 stress 0.222773 
Run 6 stress 0.2169525 
Run 7 stress 0.2262007 
Run 8 stress 0.226073 
Run 9 stress 0.2171289 
Run 10 stress 0.2318284 
Run 11 stress 0.2278323 
Run 12 stress 0.2134497 
Run 13 stress 0.2192831 
Run 14 stress 0.2118285 
Run 15 stress 0.2277937 
Run 16 stress 0.2111281 
Run 17 stress 0.2105766 
Run 18 stress 0.4130426 
Run 19 stress 0.2136909 
Run 20 stress 0.2176685 
*** No convergence -- monoMDS stopping criteria:
    20: stress ratio > sratmax
title = c("NMDS of 16S microbiome, Bray-Curtis distance")

NMDS.bray.plot <- plot_ordination(M.std, NMDS_bray, color = "BV", shape = "Inflammation", 
    title = title)
NMDS.bray.plot <- NMDS.bray.plot + theme(axis.text = element_text(size = 16, 
    face = "bold"), axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) + 
    theme_bw() + labs(color = "BV") + geom_point(size = 5)
NMDS.bray.plot

PCoA.ord.bray <- ordinate(M.std, "PCoA", "bray")
title = c("PCoA of 16S microbiome, Bray-Curtis distance")
PCoA.ord <- plot_ordination(M.std, PCoA.ord.bray, color = "cluster", shape = "BV", 
    title = title)
PCoA.ord <- PCoA.ord + theme(axis.text = element_text(size = 16, face = "bold"), 
    axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) + 
    theme_bw() + labs(color = "cluster") + geom_point(size = 5)
PCoA.ord

# permanova
require(vegan)
phy_bray <- phyloseq::distance(M.std, method = "bray")
sampledf <- data.frame(sample_data(M.std))

adonis(phy_bray ~ BV, data = sampledf)

Call:
adonis(formula = phy_bray ~ BV, data = sampledf) 

Permutation: free
Number of permutations: 999

Terms added sequentially (first to last)

           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
BV          1     7.396  7.3956  27.198 0.19969  0.001 ***
Residuals 109    29.639  0.2719         0.80031           
Total     110    37.035                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)

Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups      1 0.00977 0.0097732 0.5461    999  0.469
Residuals 109 1.95083 0.0178975                     
adonis(phy_bray ~ Inflammation, data = sampledf)

Call:
adonis(formula = phy_bray ~ Inflammation, data = sampledf) 

Permutation: free
Number of permutations: 999

Terms added sequentially (first to last)

              Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
Inflammation   1     3.676  3.6759  12.011 0.09926  0.001 ***
Residuals    109    33.359  0.3060         0.90074           
Total        110    37.035                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$Inflammation)
permutest(beta)

Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups      1 0.00025 0.0002487 0.0177    999  0.904
Residuals 109 1.53154 0.0140509                     
adonis(phy_bray ~ cluster, data = sampledf)

Call:
adonis(formula = phy_bray ~ cluster, data = sampledf) 

Permutation: free
Number of permutations: 999

Terms added sequentially (first to last)

           Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)    
cluster     2    17.856  8.9278  50.273 0.48213  0.001 ***
Residuals 108    19.179  0.1776         0.51787           
Total     110    37.035                 1.00000           
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)

Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999

Response: Distances
           Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
Groups      1 0.00977 0.0097732 0.5461    999  0.448
Residuals 109 1.95083 0.0178975                     

Multivariate logistic regression

sampledf$cluster <- as.factor(sampledf$cluster)
logit_model <- glm(Inflammation ~ cluster + BMI + Age + BV, family = binomial(link = "logit"), 
    data = sampledf)
summary(logit_model)

Call:
glm(formula = Inflammation ~ cluster + BMI + Age + BV, family = binomial(link = "logit"), 
    data = sampledf)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.8491  -0.6003  -0.5532   0.8470   1.9964  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  1.33330    3.24810   0.410  0.68145   
cluster2    -0.85151    0.88297  -0.964  0.33486   
cluster3    -0.61949    0.68378  -0.906  0.36495   
BMI          0.02086    0.05698   0.366  0.71434   
Age         -0.02631    0.15561  -0.169  0.86574   
BVPositive  -2.22065    0.76632  -2.898  0.00376 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 137.19  on 99  degrees of freedom
Residual deviance: 101.90  on 94  degrees of freedom
  (11 observations deleted due to missingness)
AIC: 113.9

Number of Fisher Scoring iterations: 4
exp(logit_model$coefficients)
(Intercept)    cluster2    cluster3         BMI         Age  BVPositive 
  3.7935318   0.4267703   0.5382209   1.0210768   0.9740343   0.1085389 
exp(confint(logit_model))
                  2.5 %       97.5 %
(Intercept) 0.005649307 2196.3498153
cluster2    0.075719698    2.6415210
cluster3    0.132252489    2.0069491
BMI         0.912317941    1.1434501
Age         0.717828619    1.3306259
BVPositive  0.020596434    0.4536472
Wald test:
----------

Chi-squared test:
X2 = 1.1, df = 2, P(> X2) = 0.57

Differential abundance testing

# Differential abundance using DESEQ2
require(DESeq2)
M.f.des <- taxa_level(M.std, "G_species")
dds <- phyloseq_to_deseq2(M.f.des, ~Inflammation)
# geometric mean, set to zero when all coordinates are zero
geo_mean_protected <- function(x) {
    if (all(x == 0)) {
        return(0)
    }
    exp(mean(log(x[x != 0])))
}
geoMeans <- apply(counts(dds), 1, geo_mean_protected)
dds <- estimateSizeFactors(dds, geoMeans = geoMeans)
dds <- DESeq(dds, fitType = "local")
res = results(dds, cooksCutoff = FALSE)
df <- as.data.frame(colData(dds)[, "BV"])
colnames(df) <- "BV"

alpha = 0.01
sigtab = as.data.frame(res[which(res$padj < alpha & abs(res$log2FoldChange) > 
    2), ])
# sigtab = cbind(as(sigtab, 'data.frame'),
# as(tax_table(M.f.des)[rownames(sigtab), ], 'matrix'))
dim(sigtab)
[1] 14  6
table_sig <- sigtab[c("log2FoldChange", "padj")]
table_sig$Abundant_Group <- levels(df$BV)[as.numeric(sigtab$log2FoldChange > 
    0) + 1]
table_sig %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position", 
    "repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
log2FoldChange padj Abundant_Group
Sneathia -3.546803 0.0078426 Negative
Sneathia Sneathia amnii -2.258075 0.0016651 Negative
Streptococcus Streptococcus anginosus subsp. anginosus 3.480048 0.0078426 Positive
Gemella Gemella asaccharolytica -2.803589 0.0016651 Negative
Lactobacillus 2.150335 0.0056942 Positive
Anaerococcus uncultured Anaerococcus sp. 2.084764 0.0078426 Positive
Moryella Moryella sp. KHD1 -3.394192 0.0090199 Negative
Fastidiosipila unidentified -3.837239 0.0008019 Negative
Shuttleworthia uncultured bacterium -5.976390 0.0000000 Negative
Megasphaera -3.245417 0.0000000 Negative
Sutterella Sutterella sp. Marseille-P3161 -5.566145 0.0056942 Negative
Porphyromonas Bacteroidales bacterium KA00251 -4.642502 0.0001815 Negative
Prevotella Prevotella sp. DNF00663 -3.944970 0.0066978 Negative
Prevotella -4.997841 0.0000000 Negative
library("pheatmap")
select <- order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE)[1:50]
df <- as.data.frame(colData(dds)[, c("Inflammation", "BV", "cluster")])

df1 <- data.frame(assay(dds))[select, ]
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F, 
    cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

Random forest classification

require(randomForest)
data.rf <- data.frame(otu_table(M.f.des))
data.rf$BV <- sample_data(M.std)$BV
BV.rf <- randomForest(BV ~ ., data = data.rf, importance = TRUE, proximity = TRUE)
df_accuracy <- as.data.frame(BV.rf$importance)
df_accuracy$Taxa <- rownames(df_accuracy)
df_accuracy <- df_accuracy[order(df_accuracy$MeanDecreaseAccuracy, decreasing = TRUE), 
    ][1:20, ]
df_accuracy$Taxa <- factor(df_accuracy$Taxa, levels = df_accuracy$Taxa)
mda_plot <- ggplot(data = df_accuracy, aes(x = Taxa, y = MeanDecreaseAccuracy)) + 
    theme_bw()
mda_plot <- mda_plot + geom_bar(stat = "identity", fill = "darkblue", width = 0.5)
mda_plot <- mda_plot + theme(axis.text.x = element_text(angle = 90, hjust = 1, 
    vjust = 0.5))
mda_plot <- mda_plot + xlab("") + ylab("Mean Decrease in Accuracy") + coord_flip() + 
    theme(axis.text = element_text(size = 10, face = "bold"), axis.title = element_text(size = 16, 
        face = "bold"))
mda_plot

Session Info

R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib

locale:
[1] C

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] randomForest_4.6-14        DESeq2_1.16.1             
 [3] SummarizedExperiment_1.6.5 DelayedArray_0.2.7        
 [5] matrixStats_0.54.0         Biobase_2.36.2            
 [7] GenomicRanges_1.28.6       GenomeInfoDb_1.12.3       
 [9] IRanges_2.10.5             S4Vectors_0.14.7          
[11] BiocGenerics_0.22.1        aod_1.3.1                 
[13] vegan_2.5-4                lattice_0.20-38           
[15] permute_0.9-5              pheatmap_1.0.12           
[17] microbiomeSeq_0.1          factoextra_1.0.5          
[19] cluster_2.0.7-1            plotly_4.9.0              
[21] ggplot2_3.2.0              phyloseq_1.25.2           
[23] kableExtra_1.1.0           rmdformats_0.3.5          
[25] knitr_1.23                

loaded via a namespace (and not attached):
 [1] backports_1.1.4      uuid_0.1-2           Hmisc_4.2-0         
 [4] plyr_1.8.4           igraph_1.2.4         lazyeval_0.2.2      
 [7] sp_1.3-1             splines_3.4.1        crosstalk_1.0.0     
[10] BiocParallel_1.10.1  rncl_0.8.3           digest_0.6.19       
[13] foreach_1.4.4        htmltools_0.3.6      gdata_2.18.0        
[16] memoise_1.1.0        checkmate_1.9.3      magrittr_1.5        
[19] annotate_1.54.0      Biostrings_2.44.2    readr_1.3.1         
[22] gmodels_2.18.1       prettyunits_1.0.2    colorspace_1.4-1    
[25] blob_1.1.1           rvest_0.3.4          ggrepel_0.8.1       
[28] xfun_0.8             dplyr_0.8.0.1        crayon_1.3.4        
[31] RCurl_1.95-4.12      jsonlite_1.6         genefilter_1.58.1   
[34] phylobase_0.8.6      survival_2.44-1.1    iterators_1.0.10    
[37] ape_5.3              glue_1.3.1           gtable_0.3.0        
[40] zlibbioc_1.22.0      XVector_0.16.0       webshot_0.5.1       
[43] seqinr_3.4-5         questionr_0.7.0      dunn.test_1.3.5     
[46] adegraphics_1.0-15   scales_1.0.0         DBI_1.0.0           
[49] miniUI_0.1.1.1       Rcpp_1.0.1           htmlTable_1.13.1    
[52] viridisLite_0.3.0    xtable_1.8-4         progress_1.2.2      
[55] spData_0.3.0         bit_1.1-14           foreign_0.8-71      
[58] spdep_0.7-9          Formula_1.2-3        htmlwidgets_1.3     
[61] httr_1.4.0           RColorBrewer_1.1-2   acepack_1.4.1       
[64] pkgconfig_2.0.2      XML_3.98-1.20        nnet_7.3-12         
[67] deldir_0.1-16        locfit_1.5-9.1       labeling_0.3        
[70] AnnotationDbi_1.38.2 tidyselect_0.2.5     rlang_0.3.1         
[73] reshape2_1.4.3       later_0.8.0          munsell_0.5.0       
 [ reached getOption("max.print") -- omitted 56 entries ]

Acknowledgement

  • H3ABioNet
  • Uganda Virus Research Institute (UVRI), BCB Hub
  • UCT-CBiO

References

UVRI-H3ABioNet

7/9/2019